The Company
A bike-share program that features more than 5,800 bicycles and 600 docking stations. It sets itself apart by also offering reclining bikes, hand tricycles, and cargo bikes, making bike-share more inclusive to people with disabilities and riders who can’t use a standard two-wheeled bike.
The Users
Two different types of users are classified:
The Goal
The Department of Marketing wants to design strategies aimed at converting casual riders into annual members.
The Method
In order to accomplish the goal, the Marketing Analyst team needs to better understand how annual members and casual riders differ, why casual riders would buy a membership, and how digital media could affect their marketing tactics.
We have to analyze historical trip data to identify characteristics, trends and connections in how members and casual users use the bikes. The results will be used by the stakeholders to develop and approve an appropriate marketing strategy.
Stakeholders
The stakeholders are:
The data has been made available by Motivate International Inc. under this license.
This is public data that we can use to explore but data-privacy issues prohibit us from using riders’ personally identifiable information.
The data to analyze covers January to December 2021 and comprises more than 5 million records. It is reliable and can provide relevant information to the business task.
Package loading
# Load necessary packages (previously installed).
library(tidyverse) # to wrangle data
library(dplyr) # to manipulate data
library(lubridate) # to parse and manipulate dates
library(sqldf) # to perform SQL queries
library(ggplot2) # to visualize data
library(cowplot) # to improve visualizations
library(wordcloud) # to improve visualizations
library(ggwordcloud) # to improve visualizations
library(ggmap) # to work with maps
# Results hidden
Data loading
# Load original data.
# One CSV file per month of 2021 (January to December) is read from the
# "Datos" folder, given relative to the working directory. Each month is kept
# in its own data frame (trips_2021_01 ... trips_2021_12).
trips_2021_01 <- read.csv("Datos/202101-divvy-tripdata.csv")
trips_2021_02 <- read.csv("Datos/202102-divvy-tripdata.csv")
trips_2021_03 <- read.csv("Datos/202103-divvy-tripdata.csv")
trips_2021_04 <- read.csv("Datos/202104-divvy-tripdata.csv")
trips_2021_05 <- read.csv("Datos/202105-divvy-tripdata.csv")
trips_2021_06 <- read.csv("Datos/202106-divvy-tripdata.csv")
trips_2021_07 <- read.csv("Datos/202107-divvy-tripdata.csv")
trips_2021_08 <- read.csv("Datos/202108-divvy-tripdata.csv")
trips_2021_09 <- read.csv("Datos/202109-divvy-tripdata.csv")
trips_2021_10 <- read.csv("Datos/202110-divvy-tripdata.csv")
trips_2021_11 <- read.csv("Datos/202111-divvy-tripdata.csv")
trips_2021_12 <- read.csv("Datos/202112-divvy-tripdata.csv")
Check consistency
# Check column names
colnames(trips_2021_01)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_02)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_03)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_04)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_05)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_06)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_07)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_08)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_09)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_10)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_11)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(trips_2021_12)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
# Check structure
str(trips_2021_01)
## 'data.frame': 96834 obs. of 13 variables:
## $ ride_id : chr "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
## $ ended_at : chr "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
## $ start_station_name: chr "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
## $ start_station_id : chr "17660" "17660" "17660" "17660" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(trips_2021_02)
## 'data.frame': 49622 obs. of 13 variables:
## $ ride_id : chr "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : chr "2021-02-12 16:14:56" "2021-02-14 17:52:38" "2021-02-09 19:10:18" "2021-02-02 17:49:41" ...
## $ ended_at : chr "2021-02-12 16:21:43" "2021-02-14 18:12:09" "2021-02-09 19:19:10" "2021-02-02 17:54:06" ...
## $ start_station_name: chr "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num 42 42 41.9 41.9 41.8 ...
## $ start_lng : num -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num 42 42 41.9 41.9 41.8 ...
## $ end_lng : num -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr "member" "casual" "member" "member" ...
str(trips_2021_03)
## 'data.frame': 228496 obs. of 13 variables:
## $ ride_id : chr "CFA86D4455AA1030" "30D9DC61227D1AF3" "846D87A15682A284" "994D05AA75A168F2" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2021-03-16 08:32:30" "2021-03-28 01:26:28" "2021-03-11 21:17:29" "2021-03-11 13:26:42" ...
## $ ended_at : chr "2021-03-16 08:36:34" "2021-03-28 01:36:55" "2021-03-11 21:33:53" "2021-03-11 13:55:41" ...
## $ start_station_name: chr "Humboldt Blvd & Armitage Ave" "Humboldt Blvd & Armitage Ave" "Shields Ave & 28th Pl" "Winthrop Ave & Lawrence Ave" ...
## $ start_station_id : chr "15651" "15651" "15443" "TA1308000021" ...
## $ end_station_name : chr "Stave St & Armitage Ave" "Central Park Ave & Bloomingdale Ave" "Halsted St & 35th St" "Broadway & Sheridan Rd" ...
## $ end_station_id : chr "13266" "18017" "TA1308000043" "13323" ...
## $ start_lat : num 41.9 41.9 41.8 42 42 ...
## $ start_lng : num -87.7 -87.7 -87.6 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.9 41.8 42 42.1 ...
## $ end_lng : num -87.7 -87.7 -87.6 -87.6 -87.7 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(trips_2021_04)
## 'data.frame': 337230 obs. of 13 variables:
## $ ride_id : chr "6C992BD37A98A63F" "1E0145613A209000" "E498E15508A80BAD" "1887262AD101C604" ...
## $ rideable_type : chr "classic_bike" "docked_bike" "docked_bike" "classic_bike" ...
## $ started_at : chr "2021-04-12 18:25:36" "2021-04-27 17:27:11" "2021-04-03 12:42:45" "2021-04-17 09:17:42" ...
## $ ended_at : chr "2021-04-12 18:56:55" "2021-04-27 18:31:29" "2021-04-07 11:40:24" "2021-04-17 09:42:48" ...
## $ start_station_name: chr "State St & Pearson St" "Dorchester Ave & 49th St" "Loomis Blvd & 84th St" "Honore St & Division St" ...
## $ start_station_id : chr "TA1307000061" "KA1503000069" "20121" "TA1305000034" ...
## $ end_station_name : chr "Southport Ave & Waveland Ave" "Dorchester Ave & 49th St" "Loomis Blvd & 84th St" "Southport Ave & Waveland Ave" ...
## $ end_station_id : chr "13235" "KA1503000069" "20121" "13235" ...
## $ start_lat : num 41.9 41.8 41.7 41.9 41.7 ...
## $ start_lng : num -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.8 41.7 41.9 41.7 ...
## $ end_lng : num -87.7 -87.6 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "member" "casual" "casual" "member" ...
str(trips_2021_05)
## 'data.frame': 531633 obs. of 13 variables:
## $ ride_id : chr "C809ED75D6160B2A" "DD59FDCE0ACACAF3" "0AB83CB88C43EFC2" "7881AC6D39110C60" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-05-30 11:58:15" "2021-05-30 11:29:14" "2021-05-30 14:24:01" "2021-05-30 14:25:51" ...
## $ ended_at : chr "2021-05-30 12:10:39" "2021-05-30 12:14:09" "2021-05-30 14:25:13" "2021-05-30 14:41:04" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.8 41.9 41.9 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(trips_2021_06)
## 'data.frame': 729595 obs. of 13 variables:
## $ ride_id : chr "99FEC93BA843FB20" "06048DCFC8520CAF" "9598066F68045DF2" "B03C0FE48C412214" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-06-13 14:31:28" "2021-06-04 11:18:02" "2021-06-04 09:49:35" "2021-06-03 19:56:05" ...
## $ ended_at : chr "2021-06-13 14:34:11" "2021-06-04 11:24:19" "2021-06-04 09:55:34" "2021-06-03 20:21:55" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.8 41.8 41.8 41.8 41.8 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.8 41.8 41.8 41.8 41.8 ...
## $ end_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(trips_2021_07)
## 'data.frame': 822410 obs. of 13 variables:
## $ ride_id : chr "0A1B623926EF4E16" "B2D5583A5A5E76EE" "6F264597DDBF427A" "379B58EAB20E8AA5" ...
## $ rideable_type : chr "docked_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2021-07-02 14:44:36" "2021-07-07 16:57:42" "2021-07-25 11:30:55" "2021-07-08 22:08:30" ...
## $ ended_at : chr "2021-07-02 15:19:58" "2021-07-07 17:16:09" "2021-07-25 11:48:45" "2021-07-08 22:23:32" ...
## $ start_station_name: chr "Michigan Ave & Washington St" "California Ave & Cortez St" "Wabash Ave & 16th St" "California Ave & Cortez St" ...
## $ start_station_id : chr "13001" "17660" "SL-012" "17660" ...
## $ end_station_name : chr "Halsted St & North Branch St" "Wood St & Hubbard St" "Rush St & Hubbard St" "Carpenter St & Huron St" ...
## $ end_station_id : chr "KA1504000117" "13432" "KA1503000044" "13196" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ member_casual : chr "casual" "casual" "member" "member" ...
str(trips_2021_08)
## 'data.frame': 804352 obs. of 13 variables:
## $ ride_id : chr "99103BB87CC6C1BB" "EAFCCCFB0A3FC5A1" "9EF4F46C57AD234D" "5834D3208BFAF1DA" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-08-10 17:15:49" "2021-08-10 17:23:14" "2021-08-21 02:34:23" "2021-08-21 06:52:55" ...
## $ ended_at : chr "2021-08-10 17:22:44" "2021-08-10 17:39:24" "2021-08-21 02:50:36" "2021-08-21 07:08:13" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.8 41.8 42 42 41.8 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num 41.8 41.8 42 42 41.8 ...
## $ end_lng : num -87.7 -87.6 -87.7 -87.7 -87.6 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(trips_2021_09)
## 'data.frame': 756147 obs. of 13 variables:
## $ ride_id : chr "9DC7B962304CBFD8" "F930E2C6872D6B32" "6EF72137900BB910" "78D1DE133B3DBF55" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-09-28 16:07:10" "2021-09-28 14:24:51" "2021-09-28 00:20:16" "2021-09-28 14:51:17" ...
## $ ended_at : chr "2021-09-28 16:09:54" "2021-09-28 14:40:05" "2021-09-28 00:23:57" "2021-09-28 15:00:06" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 41.9 41.8 41.8 41.9 ...
## $ start_lng : num -87.7 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num 41.9 42 41.8 41.8 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(trips_2021_10)
## 'data.frame': 631226 obs. of 13 variables:
## $ ride_id : chr "620BC6107255BF4C" "4471C70731AB2E45" "26CA69D43D15EE14" "362947F0437E1514" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-10-22 12:46:42" "2021-10-21 09:12:37" "2021-10-16 16:28:39" "2021-10-16 16:17:48" ...
## $ ended_at : chr "2021-10-22 12:49:50" "2021-10-21 09:14:14" "2021-10-16 16:36:26" "2021-10-16 16:19:03" ...
## $ start_station_name: chr "Kingsbury St & Kinzie St" "" "" "" ...
## $ start_station_id : chr "KA1503000043" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.6 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(trips_2021_11)
## 'data.frame': 359978 obs. of 13 variables:
## $ ride_id : chr "7C00A93E10556E47" "90854840DFD508BA" "0A7D10CDD144061C" "2F3BE33085BCFF02" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-11-27 13:27:38" "2021-11-27 13:38:25" "2021-11-26 22:03:34" "2021-11-27 09:56:49" ...
## $ ended_at : chr "2021-11-27 13:46:38" "2021-11-27 13:56:10" "2021-11-26 22:05:56" "2021-11-27 10:01:50" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 42 42 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ end_lat : num 42 41.9 42 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(trips_2021_12)
## 'data.frame': 247540 obs. of 13 variables:
## $ ride_id : chr "46F8167220E4431F" "73A77762838B32FD" "4CF42452054F59C5" "3278BA87BF698339" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
## $ started_at : chr "2021-12-07 15:06:07" "2021-12-11 03:43:29" "2021-12-15 23:10:28" "2021-12-26 16:16:10" ...
## $ ended_at : chr "2021-12-07 15:13:42" "2021-12-11 04:10:23" "2021-12-15 23:23:14" "2021-12-26 16:30:53" ...
## $ start_station_name: chr "Laflin St & Cullerton St" "LaSalle Dr & Huron St" "Halsted St & North Branch St" "Halsted St & North Branch St" ...
## $ start_station_id : chr "13307" "KP1705001026" "KA1504000117" "KA1504000117" ...
## $ end_station_name : chr "Morgan St & Polk St" "Clarendon Ave & Leland Ave" "Broadway & Barry Ave" "LaSalle Dr & Huron St" ...
## $ end_station_id : chr "TA1307000130" "TA1307000119" "13137" "KP1705001026" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num 41.9 42 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.6 -87.6 -87.6 ...
## $ member_casual : chr "member" "casual" "member" "member" ...
Conclusions so far:
Column names are consistent across all files.
Structure is consistent too. No inconsistencies were found.
However, it could be necessary to adjust the type of some columns such as started_at and ended_at from char to datetime.
First, and given the previous results, we can join the twelve data frames into a single one.
# Stack the twelve monthly data frames, in calendar order, into one data frame
trips_2021 <- bind_rows(
  trips_2021_01, trips_2021_02, trips_2021_03, trips_2021_04,
  trips_2021_05, trips_2021_06, trips_2021_07, trips_2021_08,
  trips_2021_09, trips_2021_10, trips_2021_11, trips_2021_12
)
Check new Data Frame
colnames(trips_2021) # column names
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
nrow(trips_2021) # qty of records
## [1] 5595063
dim(trips_2021) # dimensions
## [1] 5595063 13
head(trips_2021) # first rows
## ride_id rideable_type started_at ended_at
## 1 E19E6F1B8D4C42ED electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44
## 2 DC88F20C2C55F27F electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12
## 3 EC45C94683FE3F27 electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14
## 4 4FA453A75AE377DB electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55
## 5 BE5E8EB4E7263A0B electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45
## 6 5D8969F88C773979 electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54
## start_station_name start_station_id end_station_name end_station_id
## 1 California Ave & Cortez St 17660
## 2 California Ave & Cortez St 17660
## 3 California Ave & Cortez St 17660
## 4 California Ave & Cortez St 17660
## 5 California Ave & Cortez St 17660
## 6 California Ave & Cortez St 17660
## start_lat start_lng end_lat end_lng member_casual
## 1 41.90034 -87.69674 41.89 -87.72 member
## 2 41.90033 -87.69671 41.90 -87.69 member
## 3 41.90031 -87.69664 41.90 -87.70 member
## 4 41.90040 -87.69666 41.92 -87.69 member
## 5 41.90033 -87.69670 41.90 -87.70 casual
## 6 41.90041 -87.69676 41.94 -87.71 casual
str(trips_2021) # structure
## 'data.frame': 5595063 obs. of 13 variables:
## $ ride_id : chr "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
## $ ended_at : chr "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
## $ start_station_name: chr "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
## $ start_station_id : chr "17660" "17660" "17660" "17660" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
summary(trips_2021) # summary
## ride_id rideable_type started_at ended_at
## Length:5595063 Length:5595063 Length:5595063 Length:5595063
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:5595063 Length:5595063 Length:5595063 Length:5595063
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.64 Min. :-87.84 Min. :41.39 Min. :-88.97
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65 Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.07 Max. :-87.52 Max. :42.17 Max. :-87.49
## NA's :4771 NA's :4771
## member_casual
## Length:5595063
## Class :character
## Mode :character
##
##
##
##
filter(trips_2021, rideable_type == "") %>% # no rideable type registered
count()
## n
## 1 0
filter(trips_2021, started_at == "") %>% # no starting time registered
count()
## n
## 1 0
filter(trips_2021, ended_at == "") %>% # no ending time registered
count()
## n
## 1 0
filter(trips_2021, member_casual == "") %>% # no member/casual registered
count()
## n
## 1 0
filter(trips_2021, start_station_id == "") %>% # no start station id registered
count()
## n
## 1 690806
filter(trips_2021, start_station_name == "") %>% # no start station name registered
count()
## n
## 1 690809
filter(trips_2021, end_station_id == "") %>% # no end station id registered
count()
## n
## 1 739170
filter(trips_2021, end_station_name == "") %>% # no end station name registered
count()
## n
## 1 739170
Some details:
More than 690000 rows with start_station_id and start_station_name unregistered.
More than 739000 rows with end_station_id and end_station_name unregistered.
More than 4700 rows with end_lat and end_lng unregistered.
No rows with empty values of rideable_type, started_at, ended_at or member_casual.
As mentioned before we want to convert the data type for started_at and ended_at from char to datetime.
Also, we need to create new columns for the sake of the analysis, such as:
trip_duration (ride length in minutes);
month (January, February, etc.);
day_of_week (Monday, Tuesday, etc.);
time_of_day (time of the day when trips start).
# Parse started_at and ended_at (read from the CSVs as character) as date-times
trips_2021 <- trips_2021 %>%
  mutate(
    started_at = as_datetime(started_at),
    ended_at = as_datetime(ended_at)
  )
# Trip duration in minutes (ended_at minus started_at, as a difftime).
# Columns are referenced by bare name so mutate() reads them from the piped
# data frame rather than from the global `trips_2021` object.
trips_2021 <- trips_2021 %>%
  mutate(trip_duration = difftime(ended_at, started_at, units = "mins"))
max(trips_2021$trip_duration) # longest trip
## Time difference of 55944.15 mins
min(trips_2021$trip_duration) # shortest trip
## Time difference of -58.03333 mins
filter(trips_2021, trip_duration <= 0) %>% # trips with negative or zero duration
count()
## n
## 1 653
# Month of the trip start as an abbreviated English name ("Jan" ... "Dec").
# The month number indexes base R's `month.abb` constant, so the labels are
# always English regardless of the session locale. The previous approach,
# format(..., "%b") followed by a recode of Spanish abbreviations, only worked
# when the locale produced exactly those spellings (e.g. "Ene.", "Set.").
trips_2021 <- trips_2021 %>%
  mutate(month = month.abb[lubridate::month(started_at)])
# Day of week of the trip start as a full English name ("Monday" ... "Sunday").
# wday() with week_start = 7 numbers the days 1 = Sunday ... 7 = Saturday; that
# number indexes a fixed English vector, so the labels do not depend on the
# session locale (the previous format(..., "%A") plus Spanish recode did).
trips_2021 <- trips_2021 %>%
  mutate(
    day_of_week = c(
      "Sunday", "Monday", "Tuesday", "Wednesday",
      "Thursday", "Friday", "Saturday"
    )[lubridate::wday(started_at, week_start = 7)]
  )
# Time of the day (when trips start), as an "HH:MM" character string.
# The column is referenced by bare name so mutate() reads it from the piped
# data frame rather than from the global `trips_2021` object.
trips_2021 <- trips_2021 %>%
  mutate(time_of_day = format(started_at, "%H:%M"))
More details:
The longest trip lasts almost 39 days (55,944 minutes), and there are 653 trips with negative or zero duration.
This is added to the facts depicted in the previous section.
In real life we should check with the stakeholders and the data team to understand why this is happening.
Since we are not able to consult them, we have to make a decision. Hence:
trips with zero or negative duration will be removed because they make no sense and are of no use;
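rows containing NA values (such as the missing end_lat and end_lng coordinates) will also be dropped; the rest of the data, including trips with empty station fields, will be kept for now, because we do not have enough information to establish an appropriate criterion for deleting them.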
# Remove every row that has an NA in any column; the result goes into a new
# data frame (`trips`) so the unfiltered `trips_2021` stays available
trips <- trips_2021 %>% drop_na()
# Keep only trips whose duration is strictly positive
trips <- trips[trips$trip_duration > 0, ]
filter(trips, trip_duration <= 0) %>% # check trips with negative or zero duration = 0
count()
## n
## 1 0
Trip Duration and Number of Rides
# Trip Duration summary
summary(as.numeric(trips$trip_duration))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02 6.75 11.98 20.98 21.73 55944.15
# Trips grouped by user type (member / casual); temporary helper for the
# per-group summaries
trips_mem_cas <- group_by(drop_na(trips), member_casual)
trips_mem_cas %>% summarize(mean_duration=mean(trip_duration)) # mean comparison
## # A tibble: 2 x 2
## member_casual mean_duration
## <chr> <drtn>
## 1 casual 30.24141 mins
## 2 member 13.35578 mins
trips_mem_cas %>% summarize(median_duration=median(trip_duration)) # median comparison
## # A tibble: 2 x 2
## member_casual median_duration
## <chr> <drtn>
## 1 casual 15.95 mins
## 2 member 9.60 mins
trips_mem_cas %>% summarize(max_duration=max(trip_duration)) # max value comparison
## # A tibble: 2 x 2
## member_casual max_duration
## <chr> <drtn>
## 1 casual 55944.150 mins
## 2 member 1499.933 mins
trips_mem_cas %>% summarize(min_duration=min(trip_duration)) # min value comparison
## # A tibble: 2 x 2
## member_casual min_duration
## <chr> <drtn>
## 1 casual 0.01666667 mins
## 2 member 0.01666667 mins
# Number of rides, mean trip duration and share of all rides per user type.
# NOTE(review): the unnamed "%" argument to mutate() adds a constant character
# column literally named `"%"`. It is kept as-is to preserve the current
# result; confirm whether anything relies on it before removing it.
by_qty <- trips %>%
  group_by(member_casual) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(trip_duration)
  ) %>%
  mutate(
    percentage_of_rides = 100 * number_of_rides / sum(number_of_rides),
    "%"
  )
print(by_qty)
## # A tibble: 2 x 5
## member_casual number_of_rides average_duration percentage_of_rides `"%"`
## <chr> <int> <drtn> <dbl> <chr>
## 1 casual 2525174 30.24141 mins 45.2 %
## 2 member 3064466 13.35578 mins 54.8 %
# Number of rides and mean trip duration per user type and month, ordered
# from the busiest to the quietest month within each user type
by_month <- trips %>%
  group_by(member_casual, month) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(trip_duration)
  ) %>%
  arrange(member_casual, desc(number_of_rides))
print(by_month, n = 24) # show all 2 x 12 rows
## # A tibble: 24 x 4
## # Groups: member_casual [2]
## member_casual month number_of_rides average_duration
## <chr> <chr> <int> <drtn>
## 1 casual Jul 441428 31.43849 mins
## 2 casual Aug 412047 27.42168 mins
## 3 casual Jun 370111 35.64996 mins
## 4 casual Sep 363417 26.67066 mins
## 5 casual Oct 256787 24.17718 mins
## 6 casual May 256549 36.98005 mins
## 7 casual Apr 136403 36.61120 mins
## 8 casual Nov 106741 20.08875 mins
## 9 casual Mar 83918 36.88276 mins
## 10 casual Dec 69606 21.28581 mins
## 11 casual Jan 18095 24.46770 mins
## 12 casual Feb 10072 44.45105 mins
## 13 member Sep 392028 13.48432 mins
## 14 member Aug 391492 13.83279 mins
## 15 member Jul 380169 14.00684 mins
## 16 member Oct 373885 12.25872 mins
## 17 member Jun 358701 14.33747 mins
## 18 member May 274578 14.42756 mins
## 19 member Nov 252960 11.08939 mins
## 20 member Apr 200522 14.45630 mins
## 21 member Dec 177769 10.83773 mins
## 22 member Mar 144399 13.79401 mins
## 23 member Jan 78631 12.49477 mins
## 24 member Feb 39332 15.48586 mins
# Number of rides and mean trip duration per user type and day of week,
# ordered from the busiest to the quietest day within each user type
by_day <- trips %>%
  group_by(member_casual, day_of_week) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(trip_duration)
  ) %>%
  arrange(member_casual, desc(number_of_rides))
print(by_day)
## # A tibble: 14 x 4
## # Groups: member_casual [2]
## member_casual day_of_week number_of_rides average_duration
## <chr> <chr> <int> <drtn>
## 1 casual Saturday 557121 32.77432 mins
## 2 casual Sunday 480300 35.33692 mins
## 3 casual Friday 363542 28.61203 mins
## 4 casual Monday 285959 30.28861 mins
## 5 casual Thursday 285646 26.05190 mins
## 6 casual Wednesday 278578 26.23160 mins
## 7 casual Tuesday 274028 26.71657 mins
## 8 member Wednesday 476961 12.62816 mins
## 9 member Tuesday 465312 12.57173 mins
## 10 member Thursday 451316 12.52827 mins
## 11 member Friday 446180 13.05322 mins
## 12 member Saturday 432808 14.92451 mins
## 13 member Monday 415995 12.94237 mins
## 14 member Sunday 375894 15.25352 mins
# Number of rides and mean trip duration per user type and starting hour.
# The hour ("00" to "23") is the first two characters of the "HH:MM"
# time_of_day string; rows are ordered from the busiest to the quietest hour
# within each user type.
by_time <- trips %>%
  mutate(hour_of_day = substr(time_of_day, start = 1, stop = 2)) %>%
  group_by(member_casual, hour_of_day) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(trip_duration)
  ) %>%
  arrange(member_casual, desc(number_of_rides))
print(by_time, n = 48) # show all 2 x 24 rows
## # A tibble: 48 x 4
## # Groups: member_casual [2]
## member_casual hour_of_day number_of_rides average_duration
## <chr> <chr> <int> <drtn>
## 1 casual 17 236278 28.28389 mins
## 2 casual 18 213538 28.42006 mins
## 3 casual 16 205045 30.03258 mins
## 4 casual 15 188374 32.46934 mins
## 5 casual 14 178243 33.34981 mins
## 6 casual 13 173182 33.05681 mins
## 7 casual 19 166047 28.58016 mins
## 8 casual 12 161865 32.17344 mins
## 9 casual 11 135734 32.09181 mins
## 10 casual 20 121880 29.81915 mins
## 11 casual 10 104544 31.55252 mins
## 12 casual 21 103614 29.83077 mins
## 13 casual 22 96142 30.22666 mins
## 14 casual 09 76069 27.26587 mins
## 15 casual 23 73949 31.40175 mins
## 16 casual 08 63475 22.58154 mins
## 17 casual 00 53784 31.19633 mins
## 18 casual 07 46678 20.22503 mins
## 19 casual 01 39128 33.79116 mins
## 20 casual 06 25696 19.64747 mins
## 21 casual 02 25457 37.76655 mins
## 22 casual 03 14016 38.12476 mins
## 23 casual 05 12519 21.66969 mins
## 24 casual 04 9917 38.79013 mins
## 25 member 17 320154 14.03325 mins
## 26 member 18 271295 13.85518 mins
## 27 member 16 257423 13.87235 mins
## 28 member 15 200993 13.81801 mins
## 29 member 19 194182 13.62962 mins
## 30 member 12 180567 13.18884 mins
## 31 member 13 177797 13.42395 mins
## 32 member 14 174627 13.87374 mins
## 33 member 08 173161 12.01245 mins
## 34 member 11 155238 13.44880 mins
## 35 member 07 149627 12.15826 mins
## 36 member 20 131567 13.46876 mins
## 37 member 09 130794 12.40158 mins
## 38 member 10 129183 13.26205 mins
## 39 member 21 97152 13.16339 mins
## 40 member 06 81797 12.04041 mins
## 41 member 22 74532 13.30599 mins
## 42 member 23 51644 13.14761 mins
## 43 member 00 33116 12.99434 mins
## 44 member 05 30013 11.50798 mins
## 45 member 01 21882 14.08890 mins
## 46 member 02 12440 13.96086 mins
## 47 member 04 8135 12.74671 mins
## 48 member 03 7147 14.06828 mins
# Rides and mean duration per user type and rideable type,
# busiest rideable type first within each user type.
by_rideable <- trips %>%
  group_by(member_casual, rideable_type) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(trip_duration)
  ) %>%
  arrange(member_casual, desc(number_of_rides))
print(by_rideable)
## # A tibble: 6 x 4
## # Groups: member_casual [2]
## member_casual rideable_type number_of_rides average_duration
## <chr> <chr> <int> <drtn>
## 1 casual classic_bike 1263331 26.248159 mins
## 2 casual electric_bike 949803 19.829425 mins
## 3 casual docked_bike 312040 78.101056 mins
## 4 member classic_bike 1982939 13.717218 mins
## 5 member electric_bike 1081526 12.693101 mins
## 6 member docked_bike 1 2.633333 mins
Top 30 Starting Stations
# Casual users: the 30 named start stations with the most rides.
# Rows whose station name is empty are left out by the query.
by_casual_start_st <- sqldf("SELECT start_station_name, COUNT(member_casual) AS casual_riders
FROM trips
WHERE start_station_name != '' and member_casual = 'casual'
GROUP BY start_station_name
ORDER BY casual_riders DESC
LIMIT 30")
print(by_casual_start_st)
## start_station_name casual_riders
## 1 Streeter Dr & Grand Ave 66268
## 2 Millennium Park 33498
## 3 Michigan Ave & Oak St 29746
## 4 Shedd Aquarium 23220
## 5 Theater on the Lake 21322
## 6 Wells St & Concord Ln 19874
## 7 Lake Shore Dr & Monroe St 19589
## 8 Clark St & Lincoln Ave 17016
## 9 Wells St & Elm St 16644
## 10 Indiana Ave & Roosevelt Rd 16603
## 11 Clark St & Elm St 16453
## 12 DuSable Lake Shore Dr & Monroe St 16215
## 13 Clark St & Armitage Ave 16188
## 14 Wabash Ave & Grand Ave 16130
## 15 New St & Illinois St 15388
## 16 Dusable Harbor 15202
## 17 Lake Shore Dr & North Blvd 14836
## 18 DuSable Lake Shore Dr & North Blvd 14785
## 19 Michigan Ave & Lake St 14683
## 20 Michigan Ave & Washington St 14289
## 21 Michigan Ave & 8th St 13624
## 22 Larrabee St & Webster Ave 13323
## 23 Wells St & Evergreen Ave 13154
## 24 Clark St & Newport St 12998
## 25 Wilton Ave & Belmont Ave 12970
## 26 Broadway & Barry Ave 12954
## 27 Fairbanks Ct & Grand Ave 12886
## 28 LaSalle St & Illinois St 12687
## 29 Dearborn St & Erie St 12543
## 30 Buckingham Fountain 12355
# Member users: the 30 named start stations with the most rides.
# Rows whose station name is empty are left out by the query.
by_member_start_st <- sqldf("SELECT start_station_name, COUNT(member_casual) AS member_riders
FROM trips
WHERE start_station_name != '' and member_casual = 'member'
GROUP BY start_station_name
ORDER BY member_riders DESC
LIMIT 30")
print(by_member_start_st)
## start_station_name member_riders
## 1 Clark St & Elm St 24728
## 2 Wells St & Concord Ln 23707
## 3 Kingsbury St & Kinzie St 23551
## 4 Wells St & Elm St 21014
## 5 Dearborn St & Erie St 19579
## 6 Wells St & Huron St 19184
## 7 St. Clair St & Erie St 18889
## 8 Broadway & Barry Ave 17793
## 9 Clinton St & Madison St 16907
## 10 Desplaines St & Kinzie St 16814
## 11 Clark St & Armitage Ave 16696
## 12 Wabash Ave & Grand Ave 16608
## 13 Clark St & Lincoln Ave 16345
## 14 Streeter Dr & Grand Ave 16341
## 15 Green St & Madison St 16004
## 16 Theater on the Lake 15481
## 17 Clinton St & Washington Blvd 15213
## 18 Wells St & Hubbard St 15200
## 19 Wilton Ave & Belmont Ave 14970
## 20 Larrabee St & Webster Ave 14770
## 21 Michigan Ave & Oak St 14561
## 22 Clark St & Wrightwood Ave 14375
## 23 Ashland Ave & Division St 14337
## 24 Ellis Ave & 60th St 14221
## 25 Dearborn Pkwy & Delaware Pl 14152
## 26 Loomis St & Lexington St 13999
## 27 Kingsbury St & Erie St 13944
## 28 Lincoln Ave & Fullerton Ave 13579
## 29 Broadway & Waveland Ave 13574
## 30 Wabash Ave & Roosevelt Rd 13544
# General parameters shared by the figures below.
casual_color <- "#54d6b8" # green, fill/colour for Casual users
member_color <- "#ab54d6" # purple, fill/colour for Member users
border_plot <- "grey" # outline colour of the whole plot
border_panel <- "grey" # outline colour of the panel area
captions <- "Data from January to December 2021"
# Common theme: title centred over the panel, caption right-aligned under the
# plot, and an outline around both the panel and the plot.
theme_options <- theme(
  plot.title.position = "panel",
  plot.title = element_text(hjust = 0.5),
  plot.caption.position = "plot",
  plot.caption = element_text(hjust = 1),
  plot.background = element_rect(color = border_plot),
  panel.background = element_rect(color = border_panel)
)
# Rides Distribution (percentage): a single stacked bar drawn in polar
# coordinates, i.e. a pie with one slice per user type.
ggplot(by_qty, aes(x = 2, y = percentage_of_rides, fill = member_casual)) +
  geom_col(width = 0.5, color = "white", size = 1) +
  # Label each slice with the user type and its share, centred in the slice.
  geom_label(
    aes(label = paste0(member_casual, "\n", round(percentage_of_rides, 2), "%")),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 4.5,
    label.size = 0,
    show.legend = FALSE
  ) +
  coord_polar(theta = "y", start = 0) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  labs(
    title = "Rides Distribution",
    caption = captions,
    fill = ""
  ) +
  theme_void() +
  theme(
    plot.title.position = "panel",
    plot.title = element_text(hjust = 0.5, margin = unit(c(0.5, 0, 0, 0), "cm")),
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 1.18, margin = unit(c(0, 1, 2, 0), "mm")),
    plot.margin = unit(c(0, 1.5, 0, 1.5), "cm"),
    plot.background = element_rect(color = border_plot),
    legend.position = c(1, 0.3)
  )
# By quantity: total number of rides per user type.
plot_by_qty <- ggplot(
  by_qty,
  aes(x = member_casual, y = number_of_rides, fill = member_casual)
) +
  geom_col(width = 0.5) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  labs(
    title = "Total Number of Rides",
    x = "User",
    y = "Number of rides",
    fill = ""
  ) +
  theme(
    plot.title.position = "panel",
    plot.title = element_text(hjust = 0.5),
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 1),
    panel.background = element_rect(color = border_panel)
  )
# By duration: average trip duration per user type (duration converted to a
# plain number for the y axis). Only this panel carries the caption.
plot_by_dur <- ggplot(
  by_qty,
  aes(x = member_casual, y = as.numeric(average_duration), fill = member_casual)
) +
  geom_col(width = 0.5) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  labs(
    title = "Average Duration",
    caption = captions,
    x = "User",
    y = "Avg. duration (minutes)",
    fill = ""
  ) +
  theme(
    plot.title.position = "panel",
    plot.title = element_text(hjust = 0.5),
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 1),
    panel.background = element_rect(color = border_panel)
  )
# Plots together: both panels side by side inside one bordered figure.
plot_grid(plot_by_qty, plot_by_dur, ncol = 2, align = "hv") +
  panel_border(color = border_panel)
# By Density
# Keep only trips shorter than 120 (same unit as trip_duration) so the density
# curves are readable; only the two columns needed for the plot are retained.
less_than_120 <- trips %>%
  filter(trip_duration < 120) %>%
  select(member_casual, trip_duration)
ggplot(
  less_than_120,
  aes(x = as.numeric(trip_duration), group = member_casual, fill = member_casual)
) +
  geom_density(adjust = 12, alpha = 0.5) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.00)), n.breaks = 12) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(
    title = "Trip Duration Density by User",
    caption = paste0("*Considering only rides shorter than 120 minutes", "\n\n", captions),
    x = "Trip duration (minutes)",
    y = "Density",
    fill = ""
  ) +
  theme_options
## Please notice that the means shown in this figure are lower than those calculated earlier because we are now only considering a trip duration less than 120 minutes. ##
# By month
# Put the month labels in calendar order for the x axis.
# NOTE(review): assumes by_month$month holds month.abb-style labels ("Jan", ...).
by_month$month <- factor(by_month$month, levels = month.abb)
# Number of rides per month, one facet per user type.
# Bar width is left at the geom default: ggplot() itself has no width setting.
ggplot(by_month, aes(x = month, y = number_of_rides, fill = member_casual)) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Number of Rides by Month",
    caption = captions,
    x = "Month",
    y = "Number of rides",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme_options
# Average trip duration per month, one facet per user type.
ggplot(by_month, aes(x = month, y = as.numeric(average_duration), fill = member_casual)) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Trip Duration by Month",
    caption = captions,
    x = "Month",
    y = "Avg. duration (minutes)",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme_options
# By day of week
# Days as an ordered factor, Monday first, so the x axis is chronological.
by_day$day_of_week <- ordered(
  by_day$day_of_week,
  levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
)
# Number of rides per weekday, one facet per user type.
# Bar width is left at the geom default: ggplot() itself has no width setting.
ggplot(by_day, aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Number of Rides by Day",
    caption = captions,
    x = "Day of week",
    y = "Number of rides",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme_options
# Average trip duration per weekday, one facet per user type.
ggplot(by_day, aes(x = day_of_week, y = as.numeric(average_duration), fill = member_casual)) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Trip Duration by Day",
    caption = captions,
    x = "Day of week",
    y = "Avg. duration (minutes)",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme_options
# Comparison in scatter plot: average duration per weekday, both user types
# on the same axes.
ggplot(by_day, aes(x = day_of_week, y = as.numeric(average_duration), color = member_casual)) +
  # Opacity is a fixed setting, so it is given to the geom instead of aes():
  # a constant inside aes() is treated as data, passed through the alpha scale
  # (so the literal value is not what gets drawn) and produces its own legend.
  geom_point(shape = 19, size = 6, alpha = 0.8) +
  labs(
    title = "Comparison: Average Trip Duration by Day",
    caption = captions,
    x = "Day of week",
    y = "Avg. duration (minutes)",
    color = ""
  ) +
  scale_color_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(n.breaks = 6) +
  theme_options
# By time of day
# Re-sort by user type and hour (by_time was previously ordered by ride count).
by_time <- arrange(by_time, member_casual, hour_of_day)
# Number of rides per start hour, one facet per user type.
# Bar width is left at the geom default: ggplot() itself has no width setting.
ggplot(by_time, aes(x = hour_of_day, y = number_of_rides, fill = member_casual)) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Number of Rides by Hour",
    caption = captions,
    x = "Time of day",
    y = "Number of rides",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6)) +
  theme_options
# Average trip duration per start hour, one facet per user type.
ggplot(by_time, aes(x = hour_of_day, y = as.numeric(average_duration), fill = member_casual)) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Trip Duration by Hour",
    caption = captions,
    x = "Time of day",
    y = "Avg. duration (minutes)",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6)) +
  theme_options
# By rideable
# Shorten the rideable labels for the axis ("classic_bike" -> "classic", ...).
by_rideable <- by_rideable %>%
  mutate(rideable_type = recode(rideable_type,
    "classic_bike" = "classic",
    "docked_bike" = "docked",
    "electric_bike" = "electric"
  ))
# Number of rides per rideable type, one facet per user type.
# Bar width is left at the geom default: ggplot() itself has no width setting.
plot_rideable_num <- ggplot(
  by_rideable,
  aes(x = rideable_type, y = number_of_rides, fill = member_casual)
) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Number of Rides by Rideable",
    x = "Rideable type",
    y = "Number of rides",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme(
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 1),
    plot.title.position = "panel",
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(color = border_panel)
  )
# Average trip duration per rideable type; only this panel carries the caption.
plot_rideable_avg <- ggplot(
  by_rideable,
  aes(x = rideable_type, y = as.numeric(average_duration), fill = member_casual)
) +
  facet_wrap(~member_casual) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Trip Duration by Rideable",
    caption = captions,
    x = "Rideable type",
    y = "Avg. duration (minutes)",
    fill = ""
  ) +
  scale_fill_manual(values = c(casual_color, member_color)) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme(
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 1),
    plot.title.position = "panel",
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(color = border_panel)
  )
# Both panels side by side inside one bordered figure.
plot_grid(plot_rideable_num, plot_rideable_avg, ncol = 2, align = "hv") +
  panel_border(color = border_panel)
# By day and rideable: rides and mean duration for every combination of
# user type, rideable type and weekday, longest average duration first.
by_day_rideable <- trips %>%
  group_by(member_casual, rideable_type, day_of_week) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(trip_duration)
  ) %>%
  arrange(member_casual, rideable_type, desc(average_duration))
# Weekdays as an ordered factor, Monday first.
by_day_rideable$day_of_week <- factor(
  by_day_rideable$day_of_week,
  levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
  ordered = TRUE
)
# Short rideable labels for the legend.
by_day_rideable <- by_day_rideable %>%
  mutate(rideable_type = recode(rideable_type,
    "classic_bike" = "classic",
    "docked_bike" = "docked",
    "electric_bike" = "electric"
  ))
# Average duration per weekday; colour encodes the user type and point shape
# encodes the rideable type.
# Opacity is a fixed setting, so it is given to the geom instead of aes():
# a constant inside aes() is treated as data, passed through the alpha scale
# (so the literal value is not what gets drawn) and produces its own legend.
ggplot(
  by_day_rideable,
  aes(x = day_of_week, y = as.numeric(average_duration), color = member_casual, shape = rideable_type)
) +
  geom_point(size = 4, alpha = 0.9) +
  labs(
    title = "Comparison: Average Trip Duration by Day and Rideable",
    caption = captions,
    x = "Day of week",
    y = "Avg. duration (minutes)",
    color = "Color",
    shape = "Shape"
  ) +
  scale_color_manual(values = c(casual_color, member_color)) +
  scale_shape_manual(values = c(18, 15, 17)) +
  scale_y_continuous(n.breaks = 6) +
  theme_options
# Number of rides per weekday, same colour/shape encoding.
ggplot(
  by_day_rideable,
  aes(x = day_of_week, y = number_of_rides, color = member_casual, shape = rideable_type)
) +
  geom_point(size = 4, alpha = 0.9) +
  labs(
    title = "Comparison: Number of Rides by Day and Rideable",
    caption = captions,
    x = "Day of week",
    y = "Number of rides",
    color = "Color",
    shape = "Shape"
  ) +
  scale_color_manual(values = c(casual_color, member_color)) +
  scale_shape_manual(values = c(18, 15, 17)) +
  theme_options
Most used starting stations and starting positions
# Wordclouds of the top starting stations: word size follows the ride count
# and opacity fades from the first (most used) station to the last.
# The alpha ramp is sized from the data so the plot still builds if the
# station tables ever hold a number of rows other than 30.
set.seed(111) # fixed seed for the casual word cloud
wc_cas <- ggplot(by_casual_start_st, aes(label = start_station_name, size = casual_riders)) +
  geom_text_wordcloud(
    area_corr = TRUE,
    color = casual_color,
    alpha = seq(0.96, 0.09, length.out = nrow(by_casual_start_st))
  ) +
  scale_size_area(max_size = 6) +
  theme(panel.background = element_rect(color = "white", fill = "white"))
set.seed(222) # fixed seed for the member word cloud
# The colour is set once on the geom; no colour aesthetic is mapped.
wc_mem <- ggplot(by_member_start_st, aes(label = start_station_name, size = member_riders)) +
  geom_text_wordcloud(
    area_corr = TRUE,
    color = member_color,
    alpha = seq(0.96, 0.09, length.out = nrow(by_member_start_st))
  ) +
  scale_size_area(max_size = 5.5) +
  theme(panel.background = element_rect(color = "white", fill = "white"))
# Casual (left) and member (right) clouds side by side.
plot_grid(wc_cas, wc_mem, ncol = 2, align = "hv", rel_widths = c(1, 1)) +
  panel_border(color = "white")
# Top ten stations - Casual users
# by_casual_start_st is already sorted by ride count, so the first ten rows
# are the ten busiest stations.
by_casual_start_st_10 <- head(by_casual_start_st, 10)
# Bars are ordered by ride count; fill is a fixed colour set on the geom
# (not mapped in aes(), so no dummy scale or legend is created) and opacity
# fades from the busiest station to the least busy one. The alpha ramp is
# sized from the data so fewer than ten stations do not break the plot.
ggplot(
  by_casual_start_st_10,
  aes(x = reorder(start_station_name, -casual_riders), y = casual_riders)
) +
  geom_col(
    position = "dodge",
    fill = casual_color,
    alpha = seq(0.92, 0.2, length.out = nrow(by_casual_start_st_10))
  ) +
  labs(
    title = "Top ten starting stations for Casual users",
    caption = captions,
    x = "Station",
    y = "Number of rides"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme(plot.margin = unit(c(5, 8, 5, 8), "mm")) +
  theme_options
# Top ten stations - Member users
# by_member_start_st is already sorted by ride count, so the first ten rows
# are the ten busiest stations.
by_member_start_st_10 <- head(by_member_start_st, 10)
# Bars are ordered by ride count; fill is a fixed colour set on the geom
# (not mapped in aes(), so no dummy scale or legend is created) and opacity
# fades from the busiest station to the least busy one. The alpha ramp is
# sized from the data so fewer than ten stations do not break the plot.
ggplot(
  by_member_start_st_10,
  aes(x = reorder(start_station_name, -member_riders), y = member_riders)
) +
  geom_col(
    position = "dodge",
    fill = member_color,
    alpha = seq(0.92, 0.2, length.out = nrow(by_member_start_st_10))
  ) +
  labs(
    title = "Top 10 starting stations for Member users",
    caption = captions,
    x = "Station",
    y = "Number of rides"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0.01, 0.05))) +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.6)) +
  theme(plot.margin = unit(c(5, 8, 5, 8), "mm")) +
  theme_options
# Top 30 Starting Positions
# Add a "lat lng" text key (start_position) identifying each distinct start
# coordinate. remove = FALSE keeps start_lat and start_lng, which the queries
# and the map below still need. The new column is placed next to start_lat;
# everything downstream refers to columns by name.
trips_start_position <- unite(
  trips, "start_position", start_lat, start_lng,
  sep = " ", remove = FALSE
)
# 30 most used start coordinates for Casual users.
by_casual_start_pos <- sqldf("SELECT start_lat, start_lng, start_position, member_casual, COUNT(member_casual) AS riders
FROM trips_start_position
WHERE member_casual = 'casual'
GROUP BY start_position
ORDER BY riders DESC
LIMIT 30")
# 30 most used start coordinates for Member users.
by_member_start_pos <- sqldf("SELECT start_lat, start_lng, start_position, member_casual, COUNT(member_casual) AS riders
FROM trips_start_position
WHERE member_casual = 'member'
GROUP BY start_position
ORDER BY riders DESC
LIMIT 30")
# Stack both result sets; member_casual tells the two groups apart.
by_start_position <- bind_rows(by_casual_start_pos, by_member_start_pos)
# Map of Chicago
# Bounding box (longitude/latitude) of the area to draw.
chicago_bounds <- c(left = -87.8020,
top = 41.9756,
right = -87.5334,
bottom = 41.7771)
# Download the base map for that box.
# NOTE(review): confirm get_stamenmap() is still supported by the installed
# ggmap version and tile provider.
chicago_map <- get_stamenmap(bbox = chicago_bounds,
zoom = 12,
maptype = "terrain")
# Overlay the most used start coordinates of both user types on a lightened
# base map: colour encodes the user type, point size the number of rides.
ggmap(chicago_map, darken = c(0.5, "white")) +
geom_point(data = by_start_position,
aes(x = start_lng, y = start_lat, color=member_casual, size = riders),
alpha = .6) +
labs(title="Most popular starting positions",
caption = paste0("\n", captions),
color = "") +
scale_color_manual(values = c(casual_color, member_color)) +
scale_size_continuous(range = c(2,8)) +
theme_map() +
theme(plot.margin = unit(c(6,2,2,20), "mm"),
plot.title = element_text(size = 13, vjust = 3.5, face = "plain"),
plot.caption = element_text(size=9),
legend.text = element_text(size=10.5, vjust = 0.7)) +
# Hide the size legend and enlarge the colour keys so they stay readable.
guides(size = "none", color = guide_legend(override.aes = list(size=4))) +
# Applied last, so its title/caption/background settings are merged on top.
theme_options
Number of rides.
Trip duration.
Months.
Day of week.
Time of day.
No significant differences between the two types of users, except that the average trip duration remains more uniform throughout the day for Member users.
Rideable type.
Starting stations.
Most used stations are not the same for Casual and Member users.
The respective stations where users start their rides are as follows.
| Casual | Member |
|---|---|
| Streeter Dr & Grand Ave | Clark St & Elm St |
| Millennium Park | Wells St & Concord Ln |
| Michigan Ave & Oak St | Kingsbury St & Kinzie St |
| Shedd Aquarium | Wells St & Elm St |
| Theater on the Lake | Dearborn St & Erie St |
| Wells St & Concord Ln | Wells St & Huron St |
| Lake Shore Dr & Monroe St | St. Clair St & Erie St |
| Clark St & Lincoln Ave | Broadway & Barry Ave |
| Wells St & Elm St | Clinton St & Madison St |
| Indiana Ave & Roosevelt Rd | Desplaines St & Kinzie St |
| Clark St & Elm St | Clark St & Armitage Ave |
| DuSable Lake Shore Dr & Monroe St | Wabash Ave & Grand Ave |
| Clark St & Armitage Ave | Clark St & Lincoln Ave |
| Wabash Ave & Grand Ave | Streeter Dr & Grand Ave |
| New St & Illinois St | Green St & Madison St |
| Dusable Harbor | Theater on the Lake |
| Lake Shore Dr & North Blvd | Clinton St & Washington Blvd |
| DuSable Lake Shore Dr & North Blvd | Wells St & Hubbard St |
| Michigan Ave & Lake St | Wilton Ave & Belmont Ave |
| Michigan Ave & Washington St | Larrabee St & Webster Ave |
| Michigan Ave & 8th St | Michigan Ave & Oak St |
| Larrabee St & Webster Ave | Clark St & Wrightwood Ave |
| Wells St & Evergreen Ave | Ashland Ave & Division St |
| Clark St & Newport St | Ellis Ave & 60th St |
| Wilton Ave & Belmont Ave | Dearborn Pkwy & Delaware Pl |
| Broadway & Barry Ave | Loomis St & Lexington St |
| Fairbanks Ct & Grand Ave | Kingsbury St & Erie St |
| LaSalle St & Illinois St | Lincoln Ave & Fullerton Ave |
| Dearborn St & Erie St | Broadway & Waveland Ave |
| Buckingham Fountain | Wabash Ave & Roosevelt Rd |
Starting Latitude/Longitude.
Similarly to starting stations, the most popular starting coordinates are not the same for both groups.
We make this differentiation (station/position) because not every record has the station name information and it is possible that there are trips that do not start in a station, depending on how a trip is defined.
It can be seen that nearly half of the trips are from Casual users. This means that there is an opportunity to carry out the proposed strategy, that is, to turn Casual users into Members.
The information above suggests that differences in trip characteristics between Casual and Member users could be explained by tourism and leisure rides, mostly among Casual users, while Members tend to mainly use bikes to commute or as transportation.
Thus, the Casual users to target for conversion into Members should be those whose trip habits are similar to those of Members.
Therefore, the company needs to focus on Casual users whose trips have similar characteristics to the following:
Another way to identify these target users is by location, looking at the starting stations and starting coordinates where Casual and Member user trips coincide (matching starting stations and starting positions), specifically considering trips from Monday to Thursday.
As a way to encourage annual subscriptions, the company could offer different promotions and discounts to target users. For example, in the fall and winter seasons, agreements with coffee shops to give free or discounted hot coffee to Member users could be a nice and attractive detail.
The advertising campaign should emphasize the importance and benefits of exercising outdoors, caring for the environment and, secondarily, the use of bicycles as a means of avoiding traffic jams during peak hours.
For future analysis, additional data could be collected, such as riders' age and the purpose of the trip for Casual users, or the purpose of the membership for Members.
The main starting stations and starting positions where the greatest number of trips by Casual users meet the conditions described in the previous section are detailed below.
# Starting stations for Target Casual users: the 30 named start stations with
# the most Casual rides that are not on docked bikes, have trip_duration
# below 20, fall on Monday to Thursday, and are outside July and August.
# The month filter accepts both abbreviated and full month names: elsewhere in
# this report months are ordered with month.abb, so a comparison against the
# full names alone may never match.
# NOTE(review): confirm which spelling trips$month actually uses.
target_casual_st <- sqldf("SELECT start_station_name, COUNT(member_casual) AS rides
FROM trips
WHERE member_casual = 'casual' AND
start_station_name != '' AND
rideable_type != 'docked_bike' AND
trip_duration < 20 AND
day_of_week IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday') AND
month NOT IN ('Jul', 'July', 'Aug', 'August')
GROUP BY start_station_name
ORDER BY rides DESC
LIMIT 30")
# Table 2 - Starting stations for Target Casual users
# Keep only the station names, under the display heading "Stations".
target_casual_table <- target_casual_st %>%
  select(Stations = start_station_name) %>%
  data.frame()
kable(
  target_casual_table,
  caption = "Table II. Top 30 starting stations for Target Casual users."
)
| Stations |
|---|
| Streeter Dr & Grand Ave |
| Wells St & Concord Ln |
| Wells St & Elm St |
| Clark St & Elm St |
| Clark St & Lincoln Ave |
| Wells St & Huron St |
| Wells St & Evergreen Ave |
| Clark St & Armitage Ave |
| Kingsbury St & Kinzie St |
| Dearborn St & Erie St |
| Larrabee St & Webster Ave |
| Wilton Ave & Belmont Ave |
| Broadway & Barry Ave |
| Millennium Park |
| Wabash Ave & Grand Ave |
| LaSalle St & Illinois St |
| Ashland Ave & Division St |
| Michigan Ave & Oak St |
| Green St & Madison St |
| Desplaines St & Kinzie St |
| Lincoln Ave & Fullerton Ave |
| Clark St & Newport St |
| Clark St & Drummond Pl |
| Clark St & Wrightwood Ave |
| Dearborn Pkwy & Delaware Pl |
| DuSable Lake Shore Dr & North Blvd |
| Wells St & Hubbard St |
| Halsted St & Roscoe St |
| New St & Illinois St |
| Michigan Ave & Lake St |
# Starting positions for Target Casual users: the 30 start coordinates with
# the most Casual rides that are not on docked bikes, have trip_duration
# below 20, fall on Monday to Thursday, and are outside July and August.
# The user filter is 'casual', matching the station query for the same target
# group. The month filter accepts both abbreviated and full month names:
# elsewhere in this report months are ordered with month.abb, so a comparison
# against the full names alone may never match.
# NOTE(review): confirm which spelling trips$month actually uses.
target_casual_pos <- sqldf("SELECT start_lat, start_lng, start_position, COUNT(member_casual) AS rides
FROM trips_start_position
WHERE member_casual = 'casual' AND
start_position != '' AND
rideable_type != 'docked_bike' AND
trip_duration < 20 AND
day_of_week IN ('Monday', 'Tuesday', 'Wednesday', 'Thursday') AND
month NOT IN ('Jul', 'July', 'Aug', 'August')
GROUP BY start_position
ORDER BY rides DESC
LIMIT 30")
# Map + Wordcloud
# Map: target start coordinates on a darkened base map, point size = rides.
plot_target_pos <- ggmap(chicago_map, darken = c(0.4, "black")) +
  geom_point(
    data = target_casual_pos,
    aes(x = start_lng, y = start_lat, size = rides),
    color = casual_color,
    alpha = .6
  ) +
  labs(caption = paste0("\n", captions)) +
  scale_size_continuous(range = c(2, 8)) +
  theme_map() +
  guides(size = "none") +
  theme(plot.caption = element_text(hjust = 1.6, size = 10))
# Word cloud: target start stations, word size = rides, opacity fading from
# the first station to the last. The alpha ramp is sized from the data so the
# plot still builds if the query returns fewer than 30 stations.
set.seed(333)
wc_target_pos <- ggplot(target_casual_st, aes(label = start_station_name, size = rides)) +
  geom_text_wordcloud(
    area_corr = TRUE,
    color = casual_color,
    alpha = seq(0.96, 0.09, length.out = nrow(target_casual_st))
  ) +
  scale_size_area(max_size = 7) +
  theme(panel.background = element_rect(color = "white", fill = "white"))
# Shared title drawn as its own row of the grid.
plot_title <- ggdraw() +
  draw_label("Most popular starting stations and positions for Target Users",
    fontface = "plain"
  )
# Rows, top to bottom: spacer, title, word cloud, map.
plot_grid(NULL, plot_title, wc_target_pos, plot_target_pos,
  nrow = 4, align = "hv",
  rel_heights = c(0.05, 0.05, 0.5, 1)
) +
  panel_border(color = border_panel)
## Please notice that stations and positions do not necessarily match ##
Matching starting stations and starting positions.
Even though preferred starting stations and starting positions differ between Casual and Member users, there are several in common, as shown below.
In this case the 40 most popular starting stations and positions for Casual users are compared to the 30 most popular stations and 20 most popular starting positions for Members, considering only trips on Monday to Thursday.
# Matching starting stations
# Named start stations ranked by Monday-Thursday ride count, Casual users.
casual_start_st_mon_thu <- sqldf("SELECT start_station_name, COUNT(member_casual) AS riders
FROM trips
WHERE start_station_name != '' AND member_casual = 'casual' AND
(day_of_week = 'Monday' OR
day_of_week = 'Tuesday' OR
day_of_week = 'Wednesday' OR
day_of_week = 'Thursday')
GROUP BY start_station_name
ORDER BY riders DESC
LIMIT 40") # 40 most popular
# Same ranking for Member users.
member_start_st_mon_thu <- sqldf("SELECT start_station_name, COUNT(member_casual) AS riders
FROM trips
WHERE start_station_name != '' AND member_casual = 'member' AND
(day_of_week = 'Monday' OR
day_of_week = 'Tuesday' OR
day_of_week = 'Wednesday' OR
day_of_week = 'Thursday')
GROUP BY start_station_name
ORDER BY riders DESC
LIMIT 30") # 30 most popular
# Short aliases used as table names inside the join query below.
st_left <- casual_start_st_mon_thu
st_right <- member_start_st_mon_thu
# Keep only the stations present in both rankings, with both ride counts.
st_join <- sqldf('SELECT st_left.start_station_name, st_left.riders AS casual_rides, st_right.riders AS member_rides
FROM st_left
INNER JOIN st_right
ON st_left.start_station_name = st_right.start_station_name')
# Table 3 - Starting stations in common
# Keep only the station names, under the display heading "Stations".
match_st_table <- data.frame(select(st_join, "Stations" = start_station_name))
# The count in the caption is taken from the table, so it stays correct if the
# number of matching stations changes.
kable(
  match_st_table,
  caption = paste0("Table III. Top ", nrow(match_st_table), " starting stations in common.")
)
| Stations |
|---|
| Wells St & Concord Ln |
| Wells St & Elm St |
| Clark St & Elm St |
| Clark St & Lincoln Ave |
| Wabash Ave & Grand Ave |
| Clark St & Armitage Ave |
| Dearborn St & Erie St |
| Wells St & Huron St |
| Broadway & Barry Ave |
| St. Clair St & Erie St |
| Kingsbury St & Kinzie St |
| Ashland Ave & Division St |
| Green St & Madison St |
| Columbus Dr & Randolph St |
| Dearborn Pkwy & Delaware Pl |
# Matching starting positions
# Start coordinates ranked by Monday-Thursday ride count, Casual users.
casual_start_pos_mon_thu <- sqldf("SELECT start_lat, start_lng, start_position, COUNT(member_casual) AS riders
FROM trips_start_position
WHERE member_casual = 'casual' AND
(day_of_week = 'Monday' OR
day_of_week = 'Tuesday' OR
day_of_week = 'Wednesday' OR
day_of_week = 'Thursday')
GROUP BY start_position
ORDER BY riders DESC
LIMIT 40") # 40 most popular
# Same ranking for Member users.
member_start_pos_mon_thu <- sqldf("SELECT start_lat, start_lng, start_position, COUNT(member_casual) AS riders
FROM trips_start_position
WHERE member_casual = 'member' AND
(day_of_week = 'Monday' OR
day_of_week = 'Tuesday' OR
day_of_week = 'Wednesday' OR
day_of_week = 'Thursday')
GROUP BY start_position
ORDER BY riders DESC
LIMIT 20") # 20 most popular
# Short aliases used as table names inside the join query below.
pos_left <- casual_start_pos_mon_thu
pos_right <- member_start_pos_mon_thu
# Keep only the coordinates present in both rankings, with both ride counts;
# latitude and longitude are taken from the Casual side.
pos_join <- sqldf('SELECT pos_left.start_position, pos_left.start_lat, pos_left.start_lng,
pos_left.riders AS casual_rides, pos_right.riders AS member_rides
FROM pos_left
INNER JOIN pos_right
ON pos_left.start_position = pos_right.start_position')
# Map + Wordcloud
# Map: start coordinates shared by both user types (Monday to Thursday) on a
# darkened base map, point size = Casual rides.
plot_mon_thu <- ggmap(chicago_map, darken = c(0.4, "black")) +
  geom_point(
    data = pos_join,
    aes(x = start_lng, y = start_lat, size = casual_rides),
    color = casual_color,
    alpha = .6
  ) +
  labs(caption = paste0("\n", captions)) +
  scale_size_continuous(range = c(2, 8)) +
  theme_map() +
  guides(size = "none") +
  theme(plot.caption = element_text(hjust = 1.6, size = 10))
# Word cloud: shared start stations, word size = Casual rides, opacity fading
# from the first station to the last. The alpha ramp is sized from the join
# result, whose row count depends on the data.
set.seed(333)
wc_cas_mon_thu <- ggplot(st_join, aes(label = start_station_name, size = casual_rides)) +
  geom_text_wordcloud(
    area_corr = TRUE,
    color = casual_color,
    alpha = seq(0.96, 0.12, length.out = nrow(st_join))
  ) +
  scale_size_area(max_size = 8) +
  theme(panel.background = element_rect(color = "white", fill = "white"))
# Shared two-line title drawn as its own row of the grid.
plot_title <- ggdraw() +
  draw_label(
    paste0(
      "Most popular starting stations and positions for Casual users", "\n",
      "matching most popular ones for Member users (Monday to Thursday)"
    ),
    fontface = "plain"
  )
# Rows, top to bottom: spacer, title, word cloud, map.
plot_grid(NULL, plot_title, wc_cas_mon_thu, plot_mon_thu,
  nrow = 4, align = "hv",
  rel_heights = c(0.08, 0.05, 0.5, 1)
) +
  panel_border(color = border_panel)
## Please notice that stations and positions do not necessarily match ##
Thank you for reading!
This is my first project; your comments and suggestions will be greatly appreciated!
The following packages were used.
# Citations: print the recommended citation entry for each package used in
# this analysis, starting with knitr.
citation("knitr")
##
## To cite the 'knitr' package in publications use:
##
## Yihui Xie (2020). knitr: A General-Purpose Package for Dynamic Report
## Generation in R. R package version 1.30.
##
## Yihui Xie (2015) Dynamic Documents with R and knitr. 2nd edition.
## Chapman and Hall/CRC. ISBN 978-1498716963
##
## Yihui Xie (2014) knitr: A Comprehensive Tool for Reproducible
## Research in R. In Victoria Stodden, Friedrich Leisch and Roger D.
## Peng, editors, Implementing Reproducible Computational Research.
## Chapman and Hall/CRC. ISBN 978-1466561595
##
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
citation("tidyverse")
##
## Wickham et al., (2019). Welcome to the tidyverse. Journal of Open
## Source Software, 4(43), 1686, https://doi.org/10.21105/joss.01686
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Welcome to the {tidyverse}},
## author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
## year = {2019},
## journal = {Journal of Open Source Software},
## volume = {4},
## number = {43},
## pages = {1686},
## doi = {10.21105/joss.01686},
## }
citation("dplyr")
##
## To cite package 'dplyr' in publications use:
##
## Hadley Wickham, Romain François, Lionel Henry and Kirill Müller
## (2021). dplyr: A Grammar of Data Manipulation. R package version
## 1.0.7. https://CRAN.R-project.org/package=dplyr
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {dplyr: A Grammar of Data Manipulation},
## author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller},
## year = {2021},
## note = {R package version 1.0.7},
## url = {https://CRAN.R-project.org/package=dplyr},
## }
citation("lubridate")
##
## To cite lubridate in publications use:
##
## Garrett Grolemund, Hadley Wickham (2011). Dates and Times Made Easy
## with lubridate. Journal of Statistical Software, 40(3), 1-25. URL
## https://www.jstatsoft.org/v40/i03/.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Dates and Times Made Easy with {lubridate}},
## author = {Garrett Grolemund and Hadley Wickham},
## journal = {Journal of Statistical Software},
## year = {2011},
## volume = {40},
## number = {3},
## pages = {1--25},
## url = {https://www.jstatsoft.org/v40/i03/},
## }
# Print the recommended citation for sqldf (loaded earlier to perform SQL
# queries). Per the notice in the output, this entry is auto-generated from the
# package DESCRIPTION file and may need manual editing before reuse.
citation("sqldf")
##
## To cite package 'sqldf' in publications use:
##
## G. Grothendieck (2017). sqldf: Manipulate R Data Frames Using SQL. R
## package version 0.4-11. https://CRAN.R-project.org/package=sqldf
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {sqldf: Manipulate R Data Frames Using SQL},
## author = {G. Grothendieck},
## year = {2017},
## note = {R package version 0.4-11},
## url = {https://CRAN.R-project.org/package=sqldf},
## }
##
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
# Print the recommended citation for ggplot2 (loaded earlier to visualize data).
citation("ggplot2")
##
## To cite ggplot2 in publications, please use:
##
## H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
## Springer-Verlag New York, 2016.
##
## A BibTeX entry for LaTeX users is
##
## @Book{,
## author = {Hadley Wickham},
## title = {ggplot2: Elegant Graphics for Data Analysis},
## publisher = {Springer-Verlag New York},
## year = {2016},
## isbn = {978-3-319-24277-4},
## url = {https://ggplot2.tidyverse.org},
## }
# Print the recommended citation for cowplot (loaded earlier to improve
# visualizations).
citation("cowplot")
##
## To cite package 'cowplot' in publications use:
##
## Claus O. Wilke (2020). cowplot: Streamlined Plot Theme and Plot
## Annotations for 'ggplot2'. R package version 1.1.1.
## https://CRAN.R-project.org/package=cowplot
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {cowplot: Streamlined Plot Theme and Plot Annotations for 'ggplot2'},
## author = {Claus O. Wilke},
## year = {2020},
## note = {R package version 1.1.1},
## url = {https://CRAN.R-project.org/package=cowplot},
## }
# Print the recommended citation for wordcloud (loaded earlier to improve
# visualizations). Per the notice in the output, this entry is auto-generated
# from the package DESCRIPTION file and may need manual editing before reuse.
citation("wordcloud")
##
## To cite package 'wordcloud' in publications use:
##
## Ian Fellows (2018). wordcloud: Word Clouds. R package version 2.6.
## https://CRAN.R-project.org/package=wordcloud
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {wordcloud: Word Clouds},
## author = {Ian Fellows},
## year = {2018},
## note = {R package version 2.6},
## url = {https://CRAN.R-project.org/package=wordcloud},
## }
##
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
# Print the recommended citation for ggwordcloud (loaded earlier to improve
# visualizations).
citation("ggwordcloud")
##
## To cite package 'ggwordcloud' in publications use:
##
## Erwan Le Pennec and Kamil Slowikowski (2019). ggwordcloud: A Word
## Cloud Geom for 'ggplot2'. R package version 0.5.0.
## https://CRAN.R-project.org/package=ggwordcloud
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {ggwordcloud: A Word Cloud Geom for 'ggplot2'},
## author = {Erwan {Le Pennec} and Kamil Slowikowski},
## year = {2019},
## note = {R package version 0.5.0},
## url = {https://CRAN.R-project.org/package=ggwordcloud},
## }
# Print the recommended citation for ggmap (loaded earlier to work with maps).
# NOTE(review): in the printed output the plain-text entry uses an http:// URL
# and omits the year, while the BibTeX entry uses https:// and year 2013 --
# reconcile the two when copying this reference into a bibliography.
citation("ggmap")
##
## To cite ggmap in publications, please use:
##
## D. Kahle and H. Wickham. ggmap: Spatial Visualization with ggplot2.
## The R Journal, 5(1), 144-161. URL
## http://journal.r-project.org/archive/2013-1/kahle-wickham.pdf
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## author = {David Kahle and Hadley Wickham},
## title = {ggmap: Spatial Visualization with ggplot2},
## journal = {The R Journal},
## year = {2013},
## volume = {5},
## number = {1},
## pages = {144--161},
## url = {https://journal.r-project.org/archive/2013-1/kahle-wickham.pdf},
## }